Description

Analyze differentially co-expressed genes.

Based on this tutorial: https://deisygysi.github.io/teaching/minipeb2021/

library(data.table)
library(dplyr)
library(igraph)
library(ggalluvial)
library(ggplot2)
require(magrittr)
library(tidyr)
set.seed(1510)
options(bitmapType='cairo')
`%ni%` <- Negate(`%in%`)

Load data

plots_dir = "/home/j.aguirreplans/Projects/Scipher/SampleSize/data/out/plots"
tables_dir = "/home/j.aguirreplans/Projects/Scipher/SampleSize/data/out/tables"

Prepare disease genes information dataset

Prepare the disease-gene associations:

disease_genes_info_file = "/home/j.aguirreplans/Projects/Scipher/SampleSize/data/disease_genes/disease_genes_info_2022.csv"

if(!(file.exists(disease_genes_info_file))){
  gene_associations_dir = '/work/ccnr/j.aguirreplans/data/gene_associations/data/out'
  disease_genes_file = paste(gene_associations_dir, 'GDA_Filtered_04042022.csv', sep='/')
  GDA = fread(disease_genes_file) %>% unique()
  GDA$HGNC_Symbol = toupper(GDA$HGNC_Symbol)
  GDA[is.na(GDA)]<-0
  GDA$NewName = tolower(GDA$NewName)
  GDA$DescriptorName = tolower(GDA$DescriptorName)
  GDA$MESHID = toupper(GDA$MESHID)
  GDA$HGNC_Symbol = toupper(GDA$HGNC_Symbol)
  GDA %<>% 
    filter(Strong > 0 | 
             Weak > 0 |
             Incompatible > 0) %>%
    mutate(DiseaseName = NewName) %>%
    filter(DescriptorName %ni% c("behavioral disciplines and activities")) %>%
    mutate(MESHID_remove = stringr::str_detect(TreeNumber, pattern = "C22|C25|F04")) %>%
    mutate(MESHID_remove = ifelse(TreeNumber == "F01", TRUE,MESHID_remove)) %>%
    filter(!(MESHID_remove)) %>%
    select(DiseaseName, HGNC_Symbol, DescriptorName) %>% 
    unique()
  
  # Make columns without special characters  
  GDA$DiseaseName.no.sp.char <- gsub(' ', '.', gsub(', ', '.', gsub('-', '.', gsub('[\\(\\)]', '', GDA$DiseaseName))))
  GDA$DescriptorName.no.sp.char <- gsub(' ', '.', gsub(', ', '.', gsub('-', '.', gsub('[\\(\\)]', '', GDA$DescriptorName))))
  
  # Write output file
  GDA %>% fwrite(disease_genes_info_file)
} else {
  GDA = fread(disease_genes_info_file)
}

head(GDA)
##                 DiseaseName HGNC_Symbol
## 1:             hepatomegaly        A1BG
## 2:            schizophrenia        A1BG
## 3:      acute kidney injury         A2M
## 4:       adenoma liver cell         A2M
## 5:        alzheimer disease         A2M
## 6: carcinoma hepatocellular         A2M
##                                                                     DescriptorName
## 1:           digestive system diseases|pathological conditions, signs and symptoms
## 2:                                                                mental disorders
## 3: female urogenital diseases and pregnancy complications|male urogenital diseases
## 4:                                             digestive system diseases|neoplasms
## 5:                                        mental disorders|nervous system diseases
## 6:                                             digestive system diseases|neoplasms
##      DiseaseName.no.sp.char
## 1:             hepatomegaly
## 2:            schizophrenia
## 3:      acute.kidney.injury
## 4:       adenoma.liver.cell
## 5:        alzheimer.disease
## 6: carcinoma.hepatocellular
##                                                          DescriptorName.no.sp.char
## 1:            digestive.system.diseases|pathological.conditions.signs.and.symptoms
## 2:                                                                mental.disorders
## 3: female.urogenital.diseases.and.pregnancy.complications|male.urogenital.diseases
## 4:                                             digestive.system.diseases|neoplasms
## 5:                                        mental.disorders|nervous.system.diseases
## 6:                                             digestive.system.diseases|neoplasms

There are 65425 disease-gene associations (870 diseases, 14833 genes).

Analyze breast carcinoma

Filter disease-gene associations by the disease “breast neoplasms”:

disease_name = "breast.neoplasms"
GDA_breast = GDA %>% select("DiseaseName.no.sp.char", "HGNC_Symbol") %>% filter(DiseaseName.no.sp.char == disease_name)
dim(GDA_breast)
## [1] 1905    2
head(GDA_breast)
##    DiseaseName.no.sp.char HGNC_Symbol
## 1:       breast.neoplasms       ABCB1
## 2:       breast.neoplasms       ABCC1
## 3:       breast.neoplasms       ABCG2
## 4:       breast.neoplasms        ABL1
## 5:       breast.neoplasms        ACHE
## 6:       breast.neoplasms       ACTA2

Compile the files of the differential co-expression analysis from CODINA classifying the nodes:

input_dir = "/work/ccnr/j.aguirreplans/Scipher/SampleSize/differential_coexpression_analysis/TCGA-BRCA_Breast.Mammary.Tissue"
result_files = list.files(input_dir)

cols = c("file_name", "size", "rep_D", "rep_N")
file_info_df = data.frame(matrix(ncol=length(cols),nrow=0, dimnames=list(NULL, cols)))

for(file_name in result_files){
  info_file = gsub(".txt","",file_name)
  file_split1 = strsplit(info_file, split="_")[[1]]
  if(file_split1[1] == "diffanalysis"){
    type_analysis = file_split1[2]
    if(type_analysis == "nodes"){
      pval = tail(file_split1, n=1)
      info_file = gsub(paste("diffanalysis_", type_analysis, "_", sep=""),"",info_file)
      info_file = gsub(paste("_pval_", pval, sep=""),"",info_file)
      file_split2 = strsplit(info_file, split="___")[[1]]
      file_D_split = strsplit(gsub(".net","",file_split2[1]), split="_")[[1]]
      file_N_split = strsplit(gsub(".net","",file_split2[2]), split="_")[[1]]
      method = file_D_split[1]
      size = as.numeric(file_D_split[(length(file_D_split)-2)])
      r_D = as.numeric(file_D_split[(length(file_D_split))])
      r_N = as.numeric(file_N_split[(length(file_N_split))])
      dataset_D = file_D_split[(length(file_D_split)-4)]
      dataset_N = file_N_split[(length(file_N_split)-4)]
      file_info_df <- rbind(file_info_df, data.frame(file_name=file_name, size=size, rep_D=r_D, rep_N=r_N))
    }
  }
}

head(file_info_df)
##                                                                                                                                      file_name
## 1 diffanalysis_nodes_pearson_tcga_TCGA-BRCA_size_100_rep_1.net___pearson_RNAseq_samples_Breast.Mammary.Tissue_size_100_rep_1.net_pval_0.05.txt
## 2 diffanalysis_nodes_pearson_tcga_TCGA-BRCA_size_100_rep_3.net___pearson_RNAseq_samples_Breast.Mammary.Tissue_size_100_rep_3.net_pval_0.05.txt
## 3 diffanalysis_nodes_pearson_tcga_TCGA-BRCA_size_100_rep_4.net___pearson_RNAseq_samples_Breast.Mammary.Tissue_size_100_rep_4.net_pval_0.05.txt
## 4 diffanalysis_nodes_pearson_tcga_TCGA-BRCA_size_100_rep_5.net___pearson_RNAseq_samples_Breast.Mammary.Tissue_size_100_rep_5.net_pval_0.05.txt
## 5 diffanalysis_nodes_pearson_tcga_TCGA-BRCA_size_120_rep_2.net___pearson_RNAseq_samples_Breast.Mammary.Tissue_size_120_rep_2.net_pval_0.05.txt
## 6 diffanalysis_nodes_pearson_tcga_TCGA-BRCA_size_120_rep_3.net___pearson_RNAseq_samples_Breast.Mammary.Tissue_size_120_rep_3.net_pval_0.05.txt
##   size rep_D rep_N
## 1  100     1     1
## 2  100     3     3
## 3  100     4     4
## 4  100     5     5
## 5  120     2     2
## 6  120     3     3

Read the files of the differential co-expression analysis from CODINA classifying the nodes, and put them together in a single table:

pval_correction_field = "pval_Phi_Tilde.adj.fdr"
pval_threshold = 0.05
cols = c("Node", "DiseaseName.no.sp.char", "Phi_tilde", "pval", "size", "rep_D", "rep_N")
nodes_df = data.frame(matrix(ncol=length(cols),nrow=0, dimnames=list(NULL, cols)))
sizes_list = sort(unique(file_info_df$size))

for(x in seq(sizes_list)){
  size = sizes_list[x]
  #print(size)
  same_size_files_df = file_info_df %>% filter(size==!!size) %>% arrange(rep_D, rep_N)
  input_files = same_size_files_df$file_name
  # same_size_info_df = data.frame()
  for(input_file in input_files){
    r_D = (same_size_files_df %>% filter(file_name == input_file))$rep_D
    r_N = (same_size_files_df %>% filter(file_name == input_file))$rep_N
    individual_df = fread(paste(input_dir, input_file, sep="/"))
    node_results_filt_df = individual_df %>%
      select(Node, Phi_tilde, !!as.symbol(pval_correction_field)) %>% 
      unique() %>% 
      rename("pval" = !!as.symbol(pval_correction_field)) %>% 
      left_join(GDA_breast, by=c("Node"="HGNC_Symbol"))
    if(nrow(nodes_df) == 0){
      nodes_df = cbind((node_results_filt_df %>% select("Node", "DiseaseName.no.sp.char", "Phi_tilde", "pval")), data.frame(size=size, rep_D=r_D, rep_N=r_N))
    } else {
      nodes_df = nodes_df %>% full_join(cbind((node_results_filt_df %>% select("Node", "DiseaseName.no.sp.char", "Phi_tilde", "pval")), data.frame(size=size, rep_D=r_D, rep_N=r_N)), by=c("Node", "DiseaseName.no.sp.char", "Phi_tilde", "pval", "size", "rep_D", "rep_N"))
    }
  }
}

# Define Phi_name
nodes_df$Phi_name = ifelse(nodes_df$Phi_tilde == "a", "common", ifelse(nodes_df$Phi_tilde == "b.D", "different", ifelse(nodes_df$Phi_tilde == "g.D", "disease-specific", ifelse(nodes_df$Phi_tilde == "g.N", "normal-specific", "undefined"))))

head(nodes_df)
##     Node DiseaseName.no.sp.char Phi_tilde         pval size rep_D rep_N
## 1:  AAAS                   <NA>       g.N 5.213856e-12   20     1     1
## 2:  AACS                   <NA>       g.N 9.553415e-07   20     1     1
## 3: AAGAB                   <NA>         U 3.745062e-01   20     1     1
## 4:  AAR2                   <NA>       g.D 7.709075e-03   20     1     1
## 5:  AARD                   <NA>         U 3.745062e-01   20     1     1
## 6: AARS2                   <NA>       g.N 1.835766e-02   20     1     1
##            Phi_name
## 1:  normal-specific
## 2:  normal-specific
## 3:        undefined
## 4: disease-specific
## 5:        undefined
## 6:  normal-specific

We consider as “undefined” both the genes that are classified as U, but also the ones that are not significantly classified in any of the categories. To do so, we need create an empty table with all considered genes for all sizes and repetitions, and fill it with the data that we already have:

reps = sort(unique(paste(nodes_df$size, nodes_df$rep_D, nodes_df$rep_N, sep="-")))
nodes_empty_df = expand.grid(sort(unique(nodes_df$Node)), reps)
nodes_empty_df = nodes_empty_df %>% separate("Var2", into=c("size", "rep_D", "rep_N"), sep="-", convert=TRUE)
colnames(nodes_empty_df) = c("Node", "size", "rep_D", "rep_N")
nodes_empty_df = nodes_empty_df %>% left_join((nodes_df %>% select(Node, DiseaseName.no.sp.char) %>% unique()), by="Node") %>% arrange(Node, size, rep_D, rep_N)
nodes_expanded_df = nodes_df %>% full_join(nodes_empty_df, by = c("Node", "DiseaseName.no.sp.char", "size", "rep_D", "rep_N")) %>% arrange(Node, size, rep_D)
nodes_expanded_df = nodes_expanded_df %>%
  mutate(Phi_tilde = replace(Phi_tilde, is.na(Phi_tilde), "U")) %>%
  mutate(Phi_name = replace(Phi_name, is.na(Phi_name), "undefined"))
nodes_expanded_file = paste(tables_dir, "codina_tcga_brca_gtex_breast_nodes.txt", sep="/")
nodes_expanded_df %>% fwrite(nodes_expanded_file)
head(nodes_expanded_df)
##    Node DiseaseName.no.sp.char Phi_tilde         pval size rep_D rep_N
## 1:  A2M       breast.neoplasms         U           NA   20     1     1
## 2:  A2M       breast.neoplasms         U           NA   20     2     2
## 3:  A2M       breast.neoplasms       g.D 5.308776e-07   20     3     3
## 4:  A2M       breast.neoplasms         U           NA   20     4     4
## 5:  A2M       breast.neoplasms         U           NA   20     5     5
## 6:  A2M       breast.neoplasms       g.N 1.259754e-06   40     1     1
##            Phi_name
## 1:        undefined
## 2:        undefined
## 3: disease-specific
## 4:        undefined
## 5:        undefined
## 6:  normal-specific

We count how many times each gene category appears in each sample size:

# Count for each gene and size the number of times that each category is predicted
nodes_counted_df = nodes_expanded_df %>% 
  group_by(Node, DiseaseName.no.sp.char, Phi_tilde, Phi_name, size) %>% 
  summarize(n_phi = n()) %>% 
  ungroup()  %>% 
  arrange(Node, size)
## `summarise()` has grouped output by 'Node', 'DiseaseName.no.sp.char',
## 'Phi_tilde', 'Phi_name'. You can override using the `.groups` argument.
nodes_counted_file = paste(tables_dir, "codina_tcga_brca_gtex_breast_nodes_counted.txt", sep="/")
nodes_counted_df %>% fwrite(nodes_counted_file)
head(nodes_counted_df)
## # A tibble: 6 × 6
##   Node  DiseaseName.no.sp.char Phi_tilde Phi_name          size n_phi
##   <chr> <chr>                  <chr>     <chr>            <dbl> <int>
## 1 A2M   breast.neoplasms       g.D       disease-specific    20     1
## 2 A2M   breast.neoplasms       U         undefined           20     4
## 3 A2M   breast.neoplasms       g.N       normal-specific     40     1
## 4 A2M   breast.neoplasms       U         undefined           40     4
## 5 A2M   breast.neoplasms       g.D       disease-specific    60     3
## 6 A2M   breast.neoplasms       g.N       normal-specific     60     1

We select the gene category that appears more for in the different repetitions of each sample size:

# Select the for each gene and size the node type that occurs more times in the different repetitions
nodes_filtered_df = nodes_counted_df %>%
  group_by(Node, DiseaseName.no.sp.char, size) %>%
  filter(n_phi == max(n_phi)) %>% 
  arrange(Node, size) %>% 
  sample_n(1) %>% # Choose randomly cases in which there is the same number of node types 
  ungroup()
 
nodes_filtered_file = paste(tables_dir, "codina_tcga_brca_gtex_breast_nodes_filtered.txt", sep="/")
nodes_filtered_df %>% fwrite(nodes_filtered_file)
head(nodes_filtered_df)
## # A tibble: 6 × 6
##   Node  DiseaseName.no.sp.char Phi_tilde Phi_name          size n_phi
##   <chr> <chr>                  <chr>     <chr>            <dbl> <int>
## 1 A2M   breast.neoplasms       U         undefined           20     4
## 2 A2M   breast.neoplasms       U         undefined           40     4
## 3 A2M   breast.neoplasms       g.D       disease-specific    60     3
## 4 A2M   breast.neoplasms       g.N       normal-specific     80     3
## 5 A2M   breast.neoplasms       g.D       disease-specific   100     2
## 6 A2M   breast.neoplasms       g.N       normal-specific    120     2

We count the number of disease genes in each category for each size:

counts_categories_empty_df = expand.grid(sort(unique(nodes_expanded_df$Phi_name)), reps)
counts_categories_empty_df = counts_categories_empty_df %>% separate("Var2", into=c("size", "rep_D", "rep_N"), sep="-", convert=TRUE)
colnames(counts_categories_empty_df) = c("Phi_name", "size", "rep_D", "rep_N")
counts_categories_df = nodes_expanded_df %>% 
  filter(!(is.na(DiseaseName.no.sp.char))) %>% 
  group_by(Phi_name, size, rep_D, rep_N) %>% count() %>% rename("n_disease"="n") %>% arrange(size)
counts_categories_df = counts_categories_df %>% full_join(counts_categories_empty_df, by = c("Phi_name", "size", "rep_D", "rep_N")) %>% arrange(Phi_name, size, rep_D, rep_N)
counts_categories_df$n_disease = ifelse(is.na(counts_categories_df$n_disease), 0, counts_categories_df$n_disease)
counts_categories_file = paste(tables_dir, "codina_tcga_brca_gtex_breast_category_counts.txt", sep="/")
counts_categories_df %>% fwrite(counts_categories_file)
head(counts_categories_df)
## # A tibble: 6 × 5
## # Groups:   Phi_name, size, rep_D, rep_N [6]
##   Phi_name  size rep_D rep_N n_disease
##   <chr>    <dbl> <dbl> <dbl>     <dbl>
## 1 common      20     1     1         0
## 2 common      20     2     2         0
## 3 common      20     3     3         0
## 4 common      20     4     4         0
## 5 common      20     5     5         0
## 6 common      40     1     1         0

We plot the number of disease genes in each category:

custom_palette = c(
                   "#619CFF", #common
                   "#F8766D", #disease-specific 
                   "#00BA38", #normal-specific
                   "#808080", #undefined
                   "#C77CFF" #different
                   )

counts_categories_df %>% 
  group_by(Phi_name, size) %>%
  summarize(mean_disease = mean(n_disease), sd_disease = sd(n_disease), max_disease=max(n_disease), min_disease=min(n_disease)) %>%
  ggplot(aes(x = size, y = mean_disease, fill = Phi_name)) + 
  geom_line(aes(col = Phi_name)) +
  geom_ribbon(aes(ymin=min_disease, ymax=max_disease), alpha=0.2) +
  xlab("Number of samples") +
  ylab("Number of disease genes") +
  scale_fill_manual(values = custom_palette) +
  scale_color_manual(values = custom_palette) +
  guides(col=guide_legend(title="Gene category"), fill=guide_legend(title="Gene category")) +
  theme_linedraw() +
  theme(plot.title =  element_text(size = 17, face="bold"), axis.title = element_text(size = 14, face="bold"), axis.text = element_text(size = 12), legend.text = element_text(size = 12), legend.title=element_text(size=14, face="bold"))
## `summarise()` has grouped output by 'Phi_name'. You can override using the
## `.groups` argument.

plot_file = paste(plots_dir, "codina_tcga_brca_gtex_breast_counts_disease_genes.png", sep="/")
ggsave(
  plot_file,
  dpi = 1200,
  width = 9000,
  height = 6000,
  units = c("px")
)

We create the alluvium plot showing the flow between gene categories across sample size:

nodes_filtered_df$Phi_name <- as.factor(nodes_filtered_df$Phi_name)
nodes_filtered_df$size = as.character(nodes_filtered_df$size)
levels(nodes_filtered_df$Phi_name) = sort(unique(nodes_filtered_df$Phi_name))
nodes_filtered_df$size <- factor(nodes_filtered_df$size, levels=as.character(sort(unique(as.integer(nodes_filtered_df$size)))))

custom_palette = c(
                   "#619CFF", #common
                   "#F8766D", #disease-specific 
                   "#00BA38", #normal-specific
                   "#808080", #undefined
                   "#C77CFF" #different
                   )
plot_file = paste(plots_dir, "codina_tcga_brca_gtex_breast_change_disease_genes.png", sep="/")
nodes_filtered_df %>% 
  filter(size %in% c(20, 80, 140, 200, 260, 320, 380, 440)) %>%
  filter(!(is.na(DiseaseName.no.sp.char))) %>%
  ggplot(aes(x = size, stratum = Phi_name, alluvium = Node, 
             fill = Phi_name, label = Phi_name)) +
  scale_fill_manual(values = custom_palette) +
  scale_color_manual(values = custom_palette) +
  geom_flow(stat = "alluvium", lode.guidance = "frontback") +
  geom_stratum() +
  xlab("Number of samples") +
  ylab("Number of disease genes") +
  guides(fill=guide_legend(title="Gene category")) +
  theme_linedraw() +
  theme(plot.title =  element_text(size = 17, face="bold"), 
        axis.title = element_text(size = 14, face="bold"), 
        axis.text = element_text(size = 12), 
        #legend.position = "bottom",
        legend.text = element_text(size = 12), 
        legend.title=element_text(size=14, face="bold"))

ggsave(
  plot_file,
  dpi = 1200,
  width = 9000,
  height = 6000,
  units = c("px")
)

We check the behavior specific genes associated with breast neoplasms:

# Nodes of breast neoplasms: https://www.nationalbreastcancer.org/other-breast-cancer-genes
# https://www.cancer.org/cancer/breast-cancer/risk-and-prevention/breast-cancer-risk-factors-you-cannot-change.html#:~:text=BRCA1%20and%20BRCA2%3A%20The%20most,which%20can%20lead%20to%20cancer.
nodes_to_follow = c("BRCA1", "PALB2", "CHEK2", "CDH1", "PTEN", "STK11", "TP53") ### WE ARE MISSING BRCA2!!!!
for (important_node in nodes_to_follow){
  # Filter results by important node
  important_df = nodes_expanded_df %>% 
    filter(size %in% c(20, 80, 140, 200, 260, 320, 380, 440)) %>%
    filter(!(is.na(DiseaseName.no.sp.char))) %>%
    filter(Node == important_node) %>%
    select(Phi_name, size, rep_D, rep_N) %>%
    group_by(Phi_name, size) %>% 
    summarize(n_evidences = n()) %>% 
    ungroup()
  # Define levels
  important_df$Phi_name <- as.factor(important_df$Phi_name)
  important_df$size = as.character(important_df$size)
  levels(important_df$Phi_name) = sort(unique(important_df$Phi_name))
  important_df$size <- factor(important_df$size, levels=as.character(sort(unique(as.integer(important_df$size)))))
  # Define palette
  if (levels(important_df$Phi_name) == c("disease-specific", "normal-specific", "undefined")){
    personalized_palette = c("#F8766D", "#00BA38", "#808080")
  } else if (levels(important_df$Phi_name) == c("normal-specific", "undefined")){
    personalized_palette = c("#00BA38", "#808080")
  } else if (levels(important_df$Phi_name) == c("common", "normal-specific", "undefined")){
    personalized_palette = c("#619CFF", "#00BA38", "#808080")
  } else if (levels(important_df$Phi_name) == c("normal-specific")){
    personalized_palette = c("#00BA38")
  } else {
    personalized_palette = c("#619CFF", "#F8766D", "#00BA38", "#808080", "#C77CFF")
  }
  # Plot
  important_node_plot = important_df %>%
    ggplot(aes(fill=Phi_name, y=n_evidences, x=size)) + 
      geom_bar(position="fill", stat="identity", width = 0.7, col="black") +
      #geom_bar(position = position_dodge(), stat="identity", col="black", width = 0.6) +
    xlab("Number of samples") +
    ylab("Fraction of evidences") +
    scale_fill_manual(values = personalized_palette) +
    guides(fill=guide_legend(title="Gene category")) +
    theme_linedraw() +
    theme(plot.title =  element_text(size = 17, face="bold"), axis.title = element_text(size = 14, face="bold"), axis.text = element_text(size = 12), legend.text = element_text(size = 12), legend.title=element_text(size=14, face="bold"))
  plot_file = paste(plots_dir, paste("codina_tcga_brca_gtex_breast_important_", important_node, ".png", sep=""), sep="/")
  ggsave(
    plot_file,
    plot=important_node_plot,
    dpi = 1200,
    width = 9000,
    height = 6000,
    units = c("px")
  )
  print(important_node_plot)
}
## `summarise()` has grouped output by 'Phi_name'. You can override using the
## `.groups` argument.
## Warning in if (levels(important_df$Phi_name) == c("disease-specific", "normal-
## specific", : the condition has length > 1 and only the first element will be
## used
## `summarise()` has grouped output by 'Phi_name'. You can override using the
## `.groups` argument.
## Warning in if (levels(important_df$Phi_name) == c("disease-specific", "normal-
## specific", : the condition has length > 1 and only the first element will be
## used
## Warning in levels(important_df$Phi_name) == c("normal-specific", "undefined"):
## longer object length is not a multiple of shorter object length
## Warning in if (levels(important_df$Phi_name) == c("normal-specific",
## "undefined")) {: the condition has length > 1 and only the first element will be
## used
## Warning in if (levels(important_df$Phi_name) == c("common", "normal-specific", :
## the condition has length > 1 and only the first element will be used

## `summarise()` has grouped output by 'Phi_name'. You can override using the
## `.groups` argument.
## Warning in levels(important_df$Phi_name) == c("disease-specific", "normal-
## specific", : longer object length is not a multiple of shorter object length
## Warning in if (levels(important_df$Phi_name) == c("disease-specific", "normal-
## specific", : the condition has length > 1 and only the first element will be
## used
## Warning in if (levels(important_df$Phi_name) == c("normal-specific",
## "undefined")) {: the condition has length > 1 and only the first element will be
## used

## `summarise()` has grouped output by 'Phi_name'. You can override using the
## `.groups` argument.
## Warning in if (levels(important_df$Phi_name) == c("disease-specific", "normal-
## specific", : the condition has length > 1 and only the first element will be
## used

## Warning in if (levels(important_df$Phi_name) == c("disease-specific", "normal-
## specific", : the condition has length > 1 and only the first element will be
## used

## `summarise()` has grouped output by 'Phi_name'. You can override using the
## `.groups` argument.
## Warning in if (levels(important_df$Phi_name) == c("disease-specific", "normal-
## specific", : the condition has length > 1 and only the first element will be
## used

## `summarise()` has grouped output by 'Phi_name'. You can override using the
## `.groups` argument.
## Warning in levels(important_df$Phi_name) == c("disease-specific", "normal-
## specific", : longer object length is not a multiple of shorter object length
## Warning in if (levels(important_df$Phi_name) == c("disease-specific", "normal-
## specific", : the condition has length > 1 and only the first element will be
## used
## Warning in if (levels(important_df$Phi_name) == c("normal-specific",
## "undefined")) {: the condition has length > 1 and only the first element will be
## used

## `summarise()` has grouped output by 'Phi_name'. You can override using the
## `.groups` argument.
## Warning in levels(important_df$Phi_name) == c("disease-specific", "normal-
## specific", : longer object length is not a multiple of shorter object length
## Warning in if (levels(important_df$Phi_name) == c("disease-specific", "normal-
## specific", : the condition has length > 1 and only the first element will be
## used
## Warning in if (levels(important_df$Phi_name) == c("normal-specific",
## "undefined")) {: the condition has length > 1 and only the first element will be
## used

Calculate the fraction of unchanged / changed genes in each category for each sample size:

# Define output table
cols = c("Node", "Phi_name.prev", "size.prev", "Phi_name.curr", "size.curr")
change_of_category_df = data.frame(matrix(ncol=length(cols),nrow=0, dimnames=list(NULL, cols)))
# Remove levels
nodes_filtered_df$size = as.numeric(as.character(nodes_filtered_df$size))
# For each size, compare nodes with the ones of previous size
for (x in seq(sizes_list)){
  size = sizes_list[x]
  nodes_filtered_by_selected_size = nodes_filtered_df %>%
    filter((!(is.na(DiseaseName.no.sp.char))) & (size == !!size)) %>%
    select(Node, Phi_name, size)
  if (x == 1){
    previous_nodes_filtered = data.frame(nodes_filtered_by_selected_size)
  } else {
    nodes_filtered_compared_by_sizes = previous_nodes_filtered %>%
      inner_join(nodes_filtered_by_selected_size, by=("Node")) %>%
      rename(c("Phi_name.prev"="Phi_name.x", "Phi_name.curr"="Phi_name.y", "size.prev"="size.x", "size.curr"="size.y"))
    change_of_category_df = rbind(change_of_category_df, nodes_filtered_compared_by_sizes)
    previous_nodes_filtered = data.frame(nodes_filtered_by_selected_size)
  }
}

# Detect changes
change_of_category_df$transition_type = ifelse(change_of_category_df$Phi_name.prev == change_of_category_df$Phi_name.curr, "stable", "change")
change_of_category_df$transition_name = paste(change_of_category_df$Phi_name.prev, change_of_category_df$Phi_name.curr, sep=" / ")

# Print table
change_of_category_file = paste(tables_dir, "codina_tcga_brca_gtex_breast_changes.txt", sep="/")
change_of_category_df %>% fwrite(change_of_category_file)
head(change_of_category_df)
##     Node Phi_name.prev size.prev    Phi_name.curr size.curr transition_type
## 1    A2M     undefined        20        undefined        40          stable
## 2  ABCA3     undefined        20        undefined        40          stable
## 3 ABCB10     undefined        20 disease-specific        40          change
## 4  ABCB8     undefined        20        undefined        40          stable
## 5  ABCC1     undefined        20        undefined        40          stable
## 6   ABL1     undefined        20        undefined        40          stable
##                transition_name
## 1        undefined / undefined
## 2        undefined / undefined
## 3 undefined / disease-specific
## 4        undefined / undefined
## 5        undefined / undefined
## 6        undefined / undefined
total_stable_change_genes_df = change_of_category_df %>%
  group_by(size.prev) %>% 
  mutate(n_total = n()) %>%
  ungroup() %>%
  select(Node, Phi_name.prev, size.prev, transition_type, n_total) %>% 
  group_by(size.prev, transition_type, n_total) %>% 
  summarize(n_genes = n()) %>% 
  ungroup() %>%
  mutate(frac_genes = n_genes / n_total)
## `summarise()` has grouped output by 'size.prev', 'transition_type'. You can
## override using the `.groups` argument.
total_stable_change_genes_df$Phi_name.prev = "all"

categories_stable_change_genes_df = change_of_category_df %>%
  group_by(size.prev) %>% 
  mutate(n_total = n()) %>%
  ungroup() %>%
  filter(transition_type == "stable") %>%
  select(Node, Phi_name.prev, size.prev, transition_type, n_total) %>% 
  group_by(Phi_name.prev, size.prev, transition_type, n_total) %>% 
  summarize(n_genes = n()) %>% 
  ungroup() %>%
  mutate(frac_genes = n_genes / n_total)
## `summarise()` has grouped output by 'Phi_name.prev', 'size.prev',
## 'transition_type'. You can override using the `.groups` argument.
rbind(total_stable_change_genes_df, categories_stable_change_genes_df) %>%
  filter(transition_type == "stable") %>%
  ggplot(aes(x = size.prev, y = frac_genes, col = Phi_name.prev)) + 
  geom_line(aes(size = Phi_name.prev)) +
  xlab("Number of samples") +
  ylab("Fraction of stable genes") +
  scale_size_manual(values = c("all" = 1, "common" = 0.5, "disease-specific" = 0.5, "normal-specific" = 0.5, "undefined" = 0.5)) +
  scale_color_manual(values = c("#CD9600", "#619CFF", "#F8766D", "#00BA38", "#808080")) +
  guides(col=guide_legend(title="Gene category"), size=FALSE) +
  theme_linedraw() +
  theme(plot.title =  element_text(size = 17, face="bold"), axis.title = element_text(size = 14, face="bold"), axis.text = element_text(size = 12), legend.text = element_text(size = 12), legend.title=element_text(size=14, face="bold"))
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

plot_file = paste(plots_dir, "codina_tcga_brca_gtex_breast_stable_genes.png", sep="/")
ggsave(
  plot_file,
  dpi = 1200,
  width = 9000,
  height = 6000,
  units = c("px")
)

Analyze rheumatoid arthritis

Filter disease-gene associations by the disease “arthritis rheumatoid”:

disease_name = "arthritis.rheumatoid"
GDA_ra = GDA %>% select("DiseaseName.no.sp.char", "HGNC_Symbol") %>% filter(DiseaseName.no.sp.char == disease_name)
dim(GDA_ra)
## [1] 419   2
head(GDA_ra)
##    DiseaseName.no.sp.char HGNC_Symbol
## 1:   arthritis.rheumatoid       ABCB1
## 2:   arthritis.rheumatoid       ABCC2
## 3:   arthritis.rheumatoid       ABCC3
## 4:   arthritis.rheumatoid       ABCC4
## 5:   arthritis.rheumatoid       ABCC5
## 6:   arthritis.rheumatoid       ABCG2

Compile the files of the differential co-expression analysis from CODINA:

input_dir = "/work/ccnr/j.aguirreplans/Scipher/SampleSize/differential_coexpression_analysis/complete.dataset_Whole.Blood"
result_files = list.files(input_dir)

cols = c("file_name", "size", "rep_D", "rep_N")
file_info_df = data.frame(matrix(ncol=length(cols),nrow=0, dimnames=list(NULL, cols)))

for(file_name in result_files){
  info_file = gsub(".txt","",file_name)
  file_split1 = strsplit(info_file, split="_")[[1]]
  if(file_split1[1] == "diffanalysis"){
    type_analysis = file_split1[2]
    if(type_analysis == "nodes"){
      pval = tail(file_split1, n=1)
      info_file = gsub(paste("diffanalysis_", type_analysis, "_", sep=""),"",info_file)
      info_file = gsub(paste("_pval_", pval, sep=""),"",info_file)
      file_split2 = strsplit(info_file, split="___")[[1]]
      file_D_split = strsplit(gsub(".net","",file_split2[1]), split="_")[[1]]
      file_N_split = strsplit(gsub(".net","",file_split2[2]), split="_")[[1]]
      method = file_D_split[1]
      size = as.numeric(file_D_split[(length(file_D_split)-2)])
      r_D = as.numeric(file_D_split[(length(file_D_split))])
      r_N = as.numeric(file_N_split[(length(file_N_split))])
      dataset_D = file_D_split[(length(file_D_split)-4)]
      dataset_N = file_N_split[(length(file_N_split)-4)]
      file_info_df <- rbind(file_info_df, data.frame(file_name=file_name, size=size, rep_D=r_D, rep_N=r_N))
    }
  }
}

head(file_info_df)
##                                                                                                                            file_name
## 1 diffanalysis_nodes_pearson_scipher_complete.dataset_size_100_rep_1.net___pearson_gtex_Whole.Blood_size_100_rep_1.net_pval_0.05.txt
## 2 diffanalysis_nodes_pearson_scipher_complete.dataset_size_100_rep_2.net___pearson_gtex_Whole.Blood_size_100_rep_2.net_pval_0.05.txt
## 3 diffanalysis_nodes_pearson_scipher_complete.dataset_size_100_rep_3.net___pearson_gtex_Whole.Blood_size_100_rep_3.net_pval_0.05.txt
## 4 diffanalysis_nodes_pearson_scipher_complete.dataset_size_100_rep_4.net___pearson_gtex_Whole.Blood_size_100_rep_4.net_pval_0.05.txt
## 5 diffanalysis_nodes_pearson_scipher_complete.dataset_size_100_rep_5.net___pearson_gtex_Whole.Blood_size_100_rep_5.net_pval_0.05.txt
## 6 diffanalysis_nodes_pearson_scipher_complete.dataset_size_120_rep_1.net___pearson_gtex_Whole.Blood_size_120_rep_1.net_pval_0.05.txt
##   size rep_D rep_N
## 1  100     1     1
## 2  100     2     2
## 3  100     3     3
## 4  100     4     4
## 5  100     5     5
## 6  120     1     1

Read the files of the differential co-expression analysis from CODINA classifying the nodes, and put them together in a single table:

pval_correction_field = "pval_Phi_Tilde.adj.fdr"
pval_threshold = 0.05
cols = c("Node", "DiseaseName.no.sp.char", "Phi_tilde", "pval", "size", "rep_D", "rep_N")
nodes_df = data.frame(matrix(ncol=length(cols),nrow=0, dimnames=list(NULL, cols)))
sizes_list = sort(unique(file_info_df$size))

for(x in seq(sizes_list)){
  size = sizes_list[x]
  #print(size)
  same_size_files_df = file_info_df %>% filter(size==!!size) %>% arrange(rep_D, rep_N)
  input_files = same_size_files_df$file_name
  # same_size_info_df = data.frame()
  for(input_file in input_files){
    r_D = (same_size_files_df %>% filter(file_name == input_file))$rep_D
    r_N = (same_size_files_df %>% filter(file_name == input_file))$rep_N
    individual_df = fread(paste(input_dir, input_file, sep="/"))
    node_results_filt_df = individual_df %>%
      select(Node, Phi_tilde, !!as.symbol(pval_correction_field)) %>% 
      unique() %>% 
      rename("pval" = !!as.symbol(pval_correction_field)) %>% 
      left_join(GDA_ra, by=c("Node"="HGNC_Symbol"))
    if(nrow(nodes_df) == 0){
      nodes_df = cbind((node_results_filt_df %>% select("Node", "DiseaseName.no.sp.char", "Phi_tilde", "pval")), data.frame(size=size, rep_D=r_D, rep_N=r_N))
    } else {
      nodes_df = nodes_df %>% full_join(cbind((node_results_filt_df %>% select("Node", "DiseaseName.no.sp.char", "Phi_tilde", "pval")), data.frame(size=size, rep_D=r_D, rep_N=r_N)), by=c("Node", "DiseaseName.no.sp.char", "Phi_tilde", "pval", "size", "rep_D", "rep_N"))
    }
  }
}

# Define Phi_name
nodes_df$Phi_name = ifelse(nodes_df$Phi_tilde == "a", "common", ifelse(nodes_df$Phi_tilde == "b.D", "different", ifelse(nodes_df$Phi_tilde == "g.D", "disease-specific", ifelse(nodes_df$Phi_tilde == "g.N", "normal-specific", "undefined"))))

head(nodes_df)
##       Node DiseaseName.no.sp.char Phi_tilde         pval size rep_D rep_N
## 1:     A2M                   <NA>       g.N 2.485035e-81   20     1     1
## 2: A3GALT2                   <NA>       g.N 2.215557e-07   20     1     1
## 3:    AAAS                   <NA>       g.N 1.645289e-29   20     1     1
## 4:    AACS                   <NA>       g.N 2.606093e-03   20     1     1
## 5:   AAGAB                   <NA>       g.D 1.194946e-52   20     1     1
## 6:    AAK1                   <NA>       g.D 6.865455e-19   20     1     1
##            Phi_name
## 1:  normal-specific
## 2:  normal-specific
## 3:  normal-specific
## 4:  normal-specific
## 5: disease-specific
## 6: disease-specific

We consider as “undefined” both the genes that are classified as U, but also the ones that are not significantly classified in any of the categories. To do so, we need create an empty table with all considered genes for all sizes and repetitions, and fill it with the data that we already have:

reps = sort(unique(paste(nodes_df$size, nodes_df$rep_D, nodes_df$rep_N, sep="-")))
nodes_empty_df = expand.grid(sort(unique(nodes_df$Node)), reps)
nodes_empty_df = nodes_empty_df %>% separate("Var2", into=c("size", "rep_D", "rep_N"), sep="-", convert=TRUE)
colnames(nodes_empty_df) = c("Node", "size", "rep_D", "rep_N")
nodes_empty_df = nodes_empty_df %>% left_join((nodes_df %>% select(Node, DiseaseName.no.sp.char) %>% unique()), by="Node") %>% arrange(Node, size, rep_D, rep_N)
nodes_expanded_df = nodes_df %>% full_join(nodes_empty_df, by = c("Node", "DiseaseName.no.sp.char", "size", "rep_D", "rep_N")) %>% arrange(Node, size, rep_D)
nodes_expanded_df = nodes_expanded_df %>%
  mutate(Phi_tilde = replace(Phi_tilde, is.na(Phi_tilde), "U")) %>%
  mutate(Phi_name = replace(Phi_name, is.na(Phi_name), "undefined"))
nodes_expanded_file = paste(tables_dir, "codina_scipher_ra_gtex_wholeblood_nodes.txt", sep="/")
nodes_expanded_df %>% fwrite(nodes_expanded_file)
head(nodes_expanded_df)
##        Node DiseaseName.no.sp.char Phi_tilde         pval size rep_D rep_N
## 1: A1BG-AS1                   <NA>         U           NA   20     1     1
## 2: A1BG-AS1                   <NA>         U           NA   20     2     2
## 3: A1BG-AS1                   <NA>         U 3.696587e-01   20     3     3
## 4: A1BG-AS1                   <NA>         U           NA   20     4     4
## 5: A1BG-AS1                   <NA>         U           NA   20     5     5
## 6: A1BG-AS1                   <NA>       g.N 4.971565e-50   40     1     1
##           Phi_name
## 1:       undefined
## 2:       undefined
## 3:       undefined
## 4:       undefined
## 5:       undefined
## 6: normal-specific

We count how many times each gene category appears in each sample size:

# Count for each gene and size the number of times that each category is predicted
nodes_counted_df = nodes_expanded_df %>% 
  group_by(Node, DiseaseName.no.sp.char, Phi_tilde, Phi_name, size) %>% 
  summarize(n_phi = n()) %>% 
  ungroup()  %>% 
  arrange(Node, size)
## `summarise()` has grouped output by 'Node', 'DiseaseName.no.sp.char',
## 'Phi_tilde', 'Phi_name'. You can override using the `.groups` argument.
nodes_counted_file = paste(tables_dir, "codina_scipher_ra_gtex_wholeblood_nodes_counted.txt", sep="/")
nodes_counted_df %>% fwrite(nodes_counted_file)
head(nodes_counted_df)
## # A tibble: 6 × 6
##   Node     DiseaseName.no.sp.char Phi_tilde Phi_name          size n_phi
##   <chr>    <chr>                  <chr>     <chr>            <dbl> <int>
## 1 A1BG-AS1 <NA>                   U         undefined           20     5
## 2 A1BG-AS1 <NA>                   g.D       disease-specific    40     1
## 3 A1BG-AS1 <NA>                   g.N       normal-specific     40     4
## 4 A1BG-AS1 <NA>                   g.D       disease-specific    60     2
## 5 A1BG-AS1 <NA>                   g.N       normal-specific     60     3
## 6 A1BG-AS1 <NA>                   a         common              80     1

We select the gene category that appears more for in the different repetitions of each sample size:

# Select the for each gene and size the node type that occurs more times in the different repetitions
nodes_filtered_df = nodes_counted_df %>%
  group_by(Node, DiseaseName.no.sp.char, size) %>%
  filter(n_phi == max(n_phi)) %>% 
  arrange(Node, size) %>% 
  sample_n(1) %>% # Choose randomly cases in which there is the same number of node types 
  ungroup()
 
nodes_filtered_file = paste(tables_dir, "codina_scipher_ra_gtex_wholeblood_nodes_filtered.txt", sep="/")
nodes_filtered_df %>% fwrite(nodes_filtered_file)
head(nodes_filtered_df)
## # A tibble: 6 × 6
##   Node     DiseaseName.no.sp.char Phi_tilde Phi_name          size n_phi
##   <chr>    <chr>                  <chr>     <chr>            <dbl> <int>
## 1 A1BG-AS1 <NA>                   U         undefined           20     5
## 2 A1BG-AS1 <NA>                   g.N       normal-specific     40     4
## 3 A1BG-AS1 <NA>                   g.N       normal-specific     60     3
## 4 A1BG-AS1 <NA>                   g.D       disease-specific    80     3
## 5 A1BG-AS1 <NA>                   g.D       disease-specific   100     2
## 6 A1BG-AS1 <NA>                   g.D       disease-specific   120     2

We count the number of disease genes in each category for each size:

counts_categories_empty_df = expand.grid(sort(unique(nodes_expanded_df$Phi_name)), reps)
counts_categories_empty_df = counts_categories_empty_df %>% separate("Var2", into=c("size", "rep_D", "rep_N"), sep="-", convert=TRUE)
colnames(counts_categories_empty_df) = c("Phi_name", "size", "rep_D", "rep_N")
counts_categories_df = nodes_expanded_df %>% 
  filter(!(is.na(DiseaseName.no.sp.char))) %>% 
  group_by(Phi_name, size, rep_D, rep_N) %>% count() %>% rename("n_disease"="n") %>% arrange(size)
counts_categories_df = counts_categories_df %>% full_join(counts_categories_empty_df, by = c("Phi_name", "size", "rep_D", "rep_N")) %>% arrange(Phi_name, size, rep_D, rep_N)
counts_categories_df$n_disease = ifelse(is.na(counts_categories_df$n_disease), 0, counts_categories_df$n_disease)
counts_categories_file = paste(tables_dir, "codina_scipher_ra_gtex_wholeblood_category_counts.txt", sep="/")
counts_categories_df %>% fwrite(counts_categories_file)
head(counts_categories_df)
## # A tibble: 6 × 5
## # Groups:   Phi_name, size, rep_D, rep_N [6]
##   Phi_name  size rep_D rep_N n_disease
##   <chr>    <dbl> <dbl> <dbl>     <dbl>
## 1 common      20     1     1         0
## 2 common      20     2     2         0
## 3 common      20     3     3         0
## 4 common      20     4     4         0
## 5 common      20     5     5         0
## 6 common      40     1     1         1

We plot the number of disease genes in each category:

custom_palette = c(
                   "#619CFF", #common
                   "#C77CFF", #different
                   "#F8766D", #disease-specific 
                   "#00BA38", #normal-specific
                   "#808080" #undefined
                   )

counts_categories_df %>% 
  group_by(Phi_name, size) %>%
  summarize(mean_disease = mean(n_disease), sd_disease = sd(n_disease), max_disease=max(n_disease), min_disease=min(n_disease)) %>%
  ggplot(aes(x = size, y = mean_disease, fill = Phi_name)) + 
  geom_line(aes(col = Phi_name)) +
  geom_ribbon(aes(ymin=min_disease, ymax=max_disease), alpha=0.2) +
  xlab("Number of samples") +
  ylab("Number of disease genes") +
  scale_fill_manual(values = custom_palette) +
  scale_color_manual(values = custom_palette) +
  guides(col=guide_legend(title="Gene category"), fill=guide_legend(title="Gene category")) +
  theme_linedraw() +
  theme(plot.title =  element_text(size = 17, face="bold"), axis.title = element_text(size = 14, face="bold"), axis.text = element_text(size = 12), legend.text = element_text(size = 12), legend.title=element_text(size=14, face="bold"))
## `summarise()` has grouped output by 'Phi_name'. You can override using the
## `.groups` argument.

plot_file = paste(plots_dir, "codina_scipher_ra_gtex_wholeblood_counts_disease_genes.png", sep="/")
ggsave(
  plot_file,
  dpi = 1200,
  width = 9000,
  height = 6000,
  units = c("px")
)

We create the alluvium plot showing the flow between gene categories across sample size:

nodes_filtered_df$Phi_name <- as.factor(nodes_filtered_df$Phi_name)
nodes_filtered_df$size = as.character(nodes_filtered_df$size)
levels(nodes_filtered_df$Phi_name) = sort(unique(nodes_filtered_df$Phi_name))
nodes_filtered_df$size <- factor(nodes_filtered_df$size, levels=as.character(sort(unique(as.integer(nodes_filtered_df$size)))))

custom_palette = c(
                   "#619CFF", #common
                   "#F8766D", #disease-specific 
                   "#00BA38", #normal-specific
                   "#808080", #undefined
                   "#C77CFF" #different
                   )
plot_file = paste(plots_dir, "codina_scipher_ra_gtex_wholeblood_change_disease_genes.png", sep="/")
nodes_filtered_df %>% 
  filter(size %in% c(20, 80, 140, 200, 260, 320, 380, 440)) %>%
  filter(!(is.na(DiseaseName.no.sp.char))) %>%
  ggplot(aes(x = size, stratum = Phi_name, alluvium = Node, 
             fill = Phi_name, label = Phi_name)) +
  scale_fill_manual(values = custom_palette) +
  scale_color_manual(values = custom_palette) +
  geom_flow(stat = "alluvium", lode.guidance = "frontback") +
  geom_stratum() +
  xlab("Number of samples") +
  ylab("Number of disease genes") +
  guides(fill=guide_legend(title="Gene category")) +
  theme_linedraw() +
  theme(plot.title =  element_text(size = 17, face="bold"), 
        axis.title = element_text(size = 14, face="bold"), 
        axis.text = element_text(size = 12), 
        #legend.position = "bottom",
        legend.text = element_text(size = 12), 
        legend.title=element_text(size=14, face="bold"))

ggsave(
  plot_file,
  dpi = 1200,
  width = 9000,
  height = 6000,
  units = c("px")
)

We check the behavior specific genes associated with rheumatoid arthritis:

# Get only strong evidences: "TNF", "IL6ST", "IL6R", "PTPN22", "HOTAIR", "MALAT1"
nodes_to_follow = c("IL6ST", "IL6R", "PTPN22", "HOTAIR", "MALAT1") #### !!!! MISSING TNF !!!!
#nodes_to_follow = c("HLA-DRB1", "HLA-DPB1", "IRF5", "PTPN22", "RBPJ", "RUNX1", "STAT4", "TNF")
for (important_node in nodes_to_follow){
  # Filter results by important node
  important_df = nodes_expanded_df %>% 
    filter(size %in% c(20, 80, 140, 200, 260, 320, 380, 440)) %>%
    filter(!(is.na(DiseaseName.no.sp.char))) %>%
    filter(Node == important_node) %>%
    select(Phi_name, size, rep_D, rep_N) %>%
    group_by(Phi_name, size) %>% 
    summarize(n_evidences = n()) %>% 
    ungroup()
  # Define levels
  important_df$Phi_name <- as.factor(important_df$Phi_name)
  important_df$size = as.character(important_df$size)
  levels(important_df$Phi_name) = sort(unique(important_df$Phi_name))
  important_df$size <- factor(important_df$size, levels=as.character(sort(unique(as.integer(important_df$size)))))
  # Define palette
  if ((length(levels(important_df$Phi_name)) == length(c("disease-specific", "normal-specific", "undefined"))) && (all(levels(important_df$Phi_name) == c("disease-specific", "normal-specific", "undefined")))){
    personalized_palette = c("#F8766D", "#00BA38", "#808080")
  } else if ((length(levels(important_df$Phi_name)) == length(c("normal-specific", "undefined"))) && (all(levels(important_df$Phi_name) == c("normal-specific", "undefined")))){
    personalized_palette = c("#00BA38", "#808080")
  } else if ((length(levels(important_df$Phi_name)) == length(c("common", "normal-specific", "undefined"))) && (all(levels(important_df$Phi_name) == c("common", "normal-specific", "undefined")))){
    personalized_palette = c("#619CFF", "#00BA38", "#808080")
  } else if ((length(levels(important_df$Phi_name)) == length(c("normal-specific"))) && (all(levels(important_df$Phi_name) == c("normal-specific")))){
    personalized_palette = c("#00BA38")
  } else if ((length(levels(important_df$Phi_name)) == length(c("common", "disease-specific", "normal-specific", "undefined"))) && (all(levels(important_df$Phi_name) == c("common", "disease-specific", "normal-specific", "undefined")))){
    personalized_palette = c("#619CFF", "#F8766D", "#00BA38", "#808080")
  } else {
    personalized_palette = c("#619CFF", "#F8766D", "#00BA38", "#808080", "#C77CFF")
  }
  # Plot
  important_node_plot = important_df %>%
    ggplot(aes(fill=Phi_name, y=n_evidences, x=size)) + 
      geom_bar(position="fill", stat="identity", width = 0.7, col="black") +
      #geom_bar(position = position_dodge(), stat="identity", col="black", width = 0.6) +
    xlab("Number of samples") +
    ylab("Fraction of evidences") +
    scale_fill_manual(values = personalized_palette) +
    guides(fill=guide_legend(title="Gene category")) +
    theme_linedraw() +
    theme(plot.title =  element_text(size = 17, face="bold"), axis.title = element_text(size = 14, face="bold"), axis.text = element_text(size = 12), legend.text = element_text(size = 12), legend.title=element_text(size=14, face="bold"))
  plot_file = paste(plots_dir, paste("codina_scipher_ra_gtex_wholeblood_important_", important_node, ".png", sep=""), sep="/")
  ggsave(
    plot_file,
    plot=important_node_plot,
    dpi = 1200,
    width = 9000,
    height = 6000,
    units = c("px")
  )
  print(important_node_plot)
}
## `summarise()` has grouped output by 'Phi_name'. You can override using the
## `.groups` argument.`summarise()` has grouped output by 'Phi_name'. You can
## override using the `.groups` argument.

## `summarise()` has grouped output by 'Phi_name'. You can override using the
## `.groups` argument.

## `summarise()` has grouped output by 'Phi_name'. You can override using the
## `.groups` argument.

## `summarise()` has grouped output by 'Phi_name'. You can override using the
## `.groups` argument.

Calculate the fraction of unchanged / changed genes in each category for each sample size:

# Define output table
cols = c("Node", "Phi_name.prev", "size.prev", "Phi_name.curr", "size.curr")
change_of_category_df = data.frame(matrix(ncol=length(cols),nrow=0, dimnames=list(NULL, cols)))
# Remove levels
nodes_filtered_df$size = as.numeric(as.character(nodes_filtered_df$size))
# For each size, compare nodes with the ones of previous size
for (x in seq(sizes_list)){
  size = sizes_list[x]
  nodes_filtered_by_selected_size = nodes_filtered_df %>%
    filter((!(is.na(DiseaseName.no.sp.char))) & (size == !!size)) %>%
    select(Node, Phi_name, size)
  if (x == 1){
    previous_nodes_filtered = data.frame(nodes_filtered_by_selected_size)
  } else {
    nodes_filtered_compared_by_sizes = previous_nodes_filtered %>%
      inner_join(nodes_filtered_by_selected_size, by=("Node")) %>%
      rename(c("Phi_name.prev"="Phi_name.x", "Phi_name.curr"="Phi_name.y", "size.prev"="size.x", "size.curr"="size.y"))
    change_of_category_df = rbind(change_of_category_df, nodes_filtered_compared_by_sizes)
    previous_nodes_filtered = data.frame(nodes_filtered_by_selected_size)
  }
}

# Detect changes
change_of_category_df$transition_type = ifelse(change_of_category_df$Phi_name.prev == change_of_category_df$Phi_name.curr, "stable", "change")
change_of_category_df$transition_name = paste(change_of_category_df$Phi_name.prev, change_of_category_df$Phi_name.curr, sep=" / ")

# Print table
change_of_category_file = paste(tables_dir, "codina_scipher_ra_gtex_wholeblood_changes.txt", sep="/")
change_of_category_df %>% fwrite(change_of_category_file)
head(change_of_category_df)
##    Node   Phi_name.prev size.prev    Phi_name.curr size.curr transition_type
## 1 ABCB1 normal-specific        20  normal-specific        40          stable
## 2 ABCC2       undefined        20  normal-specific        40          change
## 3 ABCC3       undefined        20        undefined        40          stable
## 4 ABCC4       undefined        20 disease-specific        40          change
## 5 ABCC5       undefined        20  normal-specific        40          change
## 6 ABCG2       undefined        20 disease-specific        40          change
##                     transition_name
## 1 normal-specific / normal-specific
## 2       undefined / normal-specific
## 3             undefined / undefined
## 4      undefined / disease-specific
## 5       undefined / normal-specific
## 6      undefined / disease-specific
total_stable_change_genes_df = change_of_category_df %>%
  group_by(size.prev) %>% 
  mutate(n_total = n()) %>%
  ungroup() %>%
  select(Node, Phi_name.prev, size.prev, transition_type, n_total) %>% 
  group_by(size.prev, transition_type, n_total) %>% 
  summarize(n_genes = n()) %>% 
  ungroup() %>%
  mutate(frac_genes = n_genes / n_total)
## `summarise()` has grouped output by 'size.prev', 'transition_type'. You can
## override using the `.groups` argument.
total_stable_change_genes_df$Phi_name.prev = "all"

categories_stable_change_genes_df = change_of_category_df %>%
  group_by(size.prev) %>% 
  mutate(n_total = n()) %>%
  ungroup() %>%
  filter(transition_type == "stable") %>%
  select(Node, Phi_name.prev, size.prev, transition_type, n_total) %>% 
  group_by(Phi_name.prev, size.prev, transition_type, n_total) %>% 
  summarize(n_genes = n()) %>% 
  ungroup() %>%
  mutate(frac_genes = n_genes / n_total)
## `summarise()` has grouped output by 'Phi_name.prev', 'size.prev',
## 'transition_type'. You can override using the `.groups` argument.
rbind(total_stable_change_genes_df, categories_stable_change_genes_df) %>%
  filter(transition_type == "stable") %>%
  ggplot(aes(x = size.prev, y = frac_genes, col = Phi_name.prev)) + 
  geom_line(aes(size = Phi_name.prev)) +
  xlab("Number of samples") +
  ylab("Fraction of stable genes") +
  scale_size_manual(values = c("all" = 1, "common" = 0.5, "disease-specific" = 0.5, "normal-specific" = 0.5, "undefined" = 0.5)) +
  scale_color_manual(values = c("#CD9600", "#619CFF", "#F8766D", "#00BA38", "#808080")) +
  guides(col=guide_legend(title="Gene category"), size=FALSE) +
  theme_linedraw() +
  theme(plot.title =  element_text(size = 17, face="bold"), axis.title = element_text(size = 14, face="bold"), axis.text = element_text(size = 12), legend.text = element_text(size = 12), legend.title=element_text(size=14, face="bold"))
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

plot_file = paste(plots_dir, "codina_scipher_ra_gtex_wholeblood_stable_genes.png", sep="/")
ggsave(
  plot_file,
  dpi = 1200,
  width = 9000,
  height = 6000,
  units = c("px")
)